Motif/TF assessed
# Load in real data
samples_full = list.files('/Volumes/data/projects/nei/hufnagel/iPSC_RPE_ATAC_Seq/fimo_motifs/', pattern = '*dat.gz', full.names = T)
fimo_data <- list()
for (i in samples_full){
sample <- str_split(str_split(i, '/')[[1]][10], '\\.')[[1]][1]
input <- read_tsv(i)
input <- input %>% mutate(sequence_name = as.character(sequence_name)) %>% mutate(sample = sample)
fimo_data[[sample]] <- input
}
sample_motifs = bind_rows(fimo_data) %>% mutate(sample=as.factor(sample))
#sample_motifs %>% group_by(sample, motif_alt_id) %>% summarise(Count=n()) %>% arrange(-Count)
tf_motif <- sample_motifs %>% select('TF' = `# motif_id`, motif_alt_id) %>% unique()
tf_motif %>% DT::datatable()
# merge sample counts with bootstrap counts
all <- bind_rows(sample_motifs %>% group_by(sample, motif_alt_id) %>% summarise(Count=n()) %>% ungroup() %>% mutate(bootstrap = 'real'),
bootstrap_counts)
# motif ggridges plotter
plotter <- function(motif, scale=TRUE) {
if (scale == TRUE){
all <- all %>% filter(motif_alt_id == !!motif) %>%
group_by(sample, motif_alt_id) %>%
mutate(`Z score` = scale(Count)) %>%
ungroup()
all %>%
mutate(sample = factor(sample, levels=c('iPSC_IIi_9','iPSC_IIJ_10','iPSC_IIK_11','iPSC_IIL_12','RFP_IIE_1','RFP_IIF_2','RFP_IIG_3','RFP_IIH_4','GFP_IIE_5','GFP_IIF_6','GFP_IIG_7','GFP_IIH_8'))) %>%
ggplot(aes(x=`Z score`, y=sample)) +
geom_density_ridges() +
geom_point(data = all %>%
filter(bootstrap == 'real', motif_alt_id == !!motif) %>%
ungroup(), aes(x=`Z score`,y=sample), colour='blue', size=2, alpha=0.5) +
scale_y_discrete(expand = c(0.01, 0)) +
theme_ridges() +
ggtitle(paste0(motif, ' (',
tf_motif %>% filter(motif_alt_id == !!motif) %>% pull(TF),
')'))
}
else {
all %>% filter(motif_alt_id == !!motif) %>%
mutate(sample = factor(sample, levels=c('iPSC_IIi_9','iPSC_IIJ_10','iPSC_IIK_11','iPSC_IIL_12','RFP_IIE_1','RFP_IIF_2','RFP_IIG_3','RFP_IIH_4','GFP_IIE_5','GFP_IIF_6','GFP_IIG_7','GFP_IIH_8'))) %>%
ggplot(aes(x=Count, y=sample)) +
geom_density_ridges() +
geom_point(data = sample_motifs %>%
filter(motif_alt_id == !!motif) %>%
group_by(sample, motif_alt_id) %>%
summarise(Count=n()) %>% ungroup(), aes(x=Count,y=sample), colour='blue', size=2, alpha=0.5) +
scale_y_discrete(expand = c(0.01, 0)) +
theme_ridges() +
ggtitle(paste0(motif, ' (',
tf_motif %>% filter(motif_alt_id == !!motif) %>% pull(TF),
')'))
}
}
DESeq2-based analysis
I realized that motifs by samples is similar to genes by samples
We are dropping RFP II E because from the plots, seems rarely enriched in anything
cts <- all %>% filter(bootstrap=='real') %>% spread(sample, Count) %>% select(-bootstrap, -TF) %>% data.frame()
row.names(cts) <- cts$motif_alt_id
cts %>% head()
cts <- cts[,-1]
coldata <- all %>% select(sample) %>% unique() %>% filter(sample!='RFP_IIE_1') %>% mutate(Type = case_when(grepl('GFP', sample) ~ 'GFP',
grepl('RFP', sample) ~ 'RFP',
TRUE ~ 'iPSC'))
dds <- DESeqDataSetFromMatrix(countData = cts %>% select(one_of(as.character(coldata$sample)) ),
colData = coldata,
design = ~ Type)
some variables in design formula are characters, converting to factors
dds$Type <- relevel(dds$Type, ref='RFP')
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
-- note: fitType='parametric', but the dispersion trend was not well captured by the
function: y = a/x + b, and a local regression fit was automatically substituted.
specify fitType='local' or 'mean' to avoid this message next time.
final dispersion estimates
fitting model and testing
res <- results(DESeq(dds))
using pre-existing size factors
estimating dispersions
found already estimated dispersions, replacing these
gene-wise dispersion estimates
mean-dispersion relationship
-- note: fitType='parametric', but the dispersion trend was not well captured by the
function: y = a/x + b, and a local regression fit was automatically substituted.
specify fitType='local' or 'mean' to avoid this message next time.
final dispersion estimates
fitting model and testing
resLFC <- lfcShrink(dds, coef='Type_GFP_vs_RFP', type='apeglm')
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
bioRxiv. https://doi.org/10.1101/303255
plotMA(resLFC)

plot(hist(resLFC$pvalue))


Table of results
The Z score is the number of standard deviation of motifs found in the sample peaks above the random set of peaks
results <- resLFC %>%
data.frame() %>%
rownames_to_column('motif_alt_id') %>%
left_join(tf_motif) %>%
arrange(pvalue) %>%
data.frame() %>%
left_join(Z_scoring %>% mutate(Type = case_when(grepl('GFP', sample) ~ 'GFP',
grepl('RFP', sample) ~ 'RFP',
TRUE ~ 'iPSC')) %>%
filter(Type=='GFP') %>% group_by(motif_alt_id) %>% summarise(`Z score` = mean(`Z score`))) %>%
left_join(gfp_rfp %>% mutate(TF=Gene, log2FC_GFP_RFP_RNASeq = log2FoldChange) %>% select(TF, log2FC_GFP_RFP_RNASeq))
Joining, by = "motif_alt_id"
Joining, by = "motif_alt_id"
Joining, by = "TF"
results <- resLFC %>%
data.frame() %>%
rownames_to_column('motif_alt_id') %>%
left_join(tf_motif) %>%
arrange(pvalue) %>%
data.frame() %>%
left_join(Z_scoring %>% mutate(Type = case_when(grepl('GFP', sample) ~ 'GFP',
grepl('RFP', sample) ~ 'RFP',
TRUE ~ 'iPSC')) %>%
filter(Type=='GFP') %>% group_by(motif_alt_id) %>% summarise(`Z score` = mean(`Z score`))) %>%
left_join(gfp_rfp %>% mutate(TF=Gene, log2FC_GFP_RFP_RNASeq = log2FoldChange) %>% select(TF, log2FC_GFP_RFP_RNASeq))
Joining, by = "motif_alt_id"
Joining, by = "motif_alt_id"
Joining, by = "TF"
results %>% filter((`Z score`) > 2.5) %>% arrange(pvalue) %>% DT::datatable()
Volcano
volcano_maker <- function(df, title){
df$Class <- 'Not significant'
df$Class[df$padj < 0.01 & df$log2FoldChange > 0.4 & (df$`Z score`) > 2.5] <- "FDR < 0.01 &\nabs(Z score) > 2.5"
df$Class[df$padj < 0.01 & df$log2FoldChange < -0.4 & (df$`Z score`) > 2.5] <- "FDR < 0.01 &\nabs(Z score) > 2.5"
df$Class <- factor(df$Class, levels=c('Not significant', "FDR < 0.01 &\nabs(Z score) > 2.5"))
plot <- ggplot(data=df,aes(x=log2FoldChange,y=-log10(pvalue))) +
geom_point(aes(colour=Class, size=`Z score`), alpha=0.5) +
scale_colour_manual(values=c("gray","darkred")) +
geom_text_repel(data=df %>% filter((padj < 0.01 & log2FoldChange > 0.4 & `Z score` > 2.5) | (padj < 0.01 & log2FoldChange < -0.4 & `Z score` > 2.5)),
aes(label=TF)) +
# geom_vline(aes(xintercept=-0.5),linetype="dotted") +
# geom_vline(aes(xintercept=0.5),linetype="dotted") +
scale_x_continuous(breaks=c(-1,-0.5,0,0.5,1)) +
ggtitle(title) + theme_minimal()
return(plot)
}
volcano_maker(results, 'GFP vs RFP TFBS motif counts')

Plots of the top motifs different between GFP and RFP
RFP II E is plotted here, but not used in the differential analysis above
plots <- list()
for (i in results %>% filter((`Z score`) > 2) %>% arrange(pvalue) %>% head(n=12) %>% pull(motif_alt_id)){
plots[[i]] <- plotter(i, scale=T)
}
plot_grid(plotlist = plots, ncol=3)
Picking joint bandwidth of 0.247
Picking joint bandwidth of 0.26
Picking joint bandwidth of 0.26
Picking joint bandwidth of 0.263
Picking joint bandwidth of 0.252
Picking joint bandwidth of 0.252
Picking joint bandwidth of 0.252
Picking joint bandwidth of 0.268
Picking joint bandwidth of 0.274
Picking joint bandwidth of 0.268
Picking joint bandwidth of 0.258
Picking joint bandwidth of 0.28

# Gene <-> motif matching
# load gtf to map transcript to gene_name
gtf <- read_tsv("/Volumes/data/genomes/1000G_phase2_GRCh37/gencode.v28lift37.metadata.HGNC.gz", col_names = c('Transcript','Gene'))
# gtf <- gtf %>% rowwise() %>% mutate(transcript = ((str_split(V9, ';')[[1]] %>% str_extract('transcript_id.*') %>% na.omit())[1] %>% str_split(., '\"'))[[1]][2],
# gene = ((str_split(V9, ';')[[1]] %>% str_extract('gene_name.*') %>% na.omit())[1] %>% str_split(., '\"'))[[1]][2])
## Sample data
### Closest TSS for OTX2 (M5700_1.02)
# samples_full = list.files('/Volumes/data/projects/nei/hufnagel/iPSC_RPE_ATAC_Seq/closest_TSS_motifs/',
# pattern = '*dat.gz', full.names = T)
#
# closestTSS_data <- list()
# for (i in samples_full){
# sample <- str_split(str_split(i, '/')[[1]][10], '\\.')[[1]][1]
# # input <- read_tsv(i, col_names = c('sequence_name', 'start', 'end', 'motif', 'fimo_pvalue', 'strand', 'tss_seq','tss_start','tss_end', 'transcript', 'blank','tss_strand','coord1','coord2','blank2','exon_num','size','exon_pos','distance'))
# input <- fread(paste('gzcat', i, '| grep M5700_1'))
# colnames(input) <- c('sequence_name', 'start', 'end', 'motif', 'fimo_pvalue', 'strand', 'tss_seq','tss_start','tss_end', 'transcript', 'blank','tss_strand','coord1','coord2','blank2','exon_num','size','exon_pos','distance')
# input <- input %>% mutate(sequence_name = as.character(sequence_name),
# tss_seq = as.character(tss_seq),
# coord1=as.numeric(coord1),
# coord2=as.numeric(coord2),
# blank2 = as.character(blank2),
# exon_num=as.integer(exon_num),
# size=as.numeric(size)) %>% mutate(sample = sample)
# closestTSS_data[[sample]] <- input
# }
# sample_closestTSS = bind_rows(closestTSS_data) %>% mutate(sample=as.factor(sample)) %>% rowwise() %>% mutate(Transcript = str_split(transcript, '_')[[1]][1])
#
# both <- left_join(sample_closestTSS, gtf) %>%
# filter(!is.na(Gene)) %>%
# mutate(motif_loc = paste(sequence_name, start, end, sep='_'))
#
# both %>%
# # remove any TSS over 500kb away
# filter(distance < 500000) %>%
# # one gene per motif
# group_by(motif_loc, sample, Gene) %>%
# top_n(1, distance) %>%
# ungroup() %>%
# # add up to two genes (total) per motif
# group_by(motif_loc, sample) %>%
# top_n(2, distance) %>%
# ungroup() %>%
# # arrange by genes most linked to motif
# group_by(Gene, sample) %>%
# summarise(Count=n(), paste(motif_loc, collapse=', ')) %>%
# arrange(-Count)
# What genes have more PAX6 'associated' motifs compared from GFP to RFP
# enriched_genes <- both %>%
# # only keep one gene per motif
# group_by(motif_loc, sample, Gene) %>%
# top_n(1, distance) %>%
# ungroup() %>%
# # keep up to two genes per motif
# group_by(motif_loc, sample) %>%
# top_n(2, distance) %>%
# ungroup() %>%
# # arrange by genes most linked to motif
# group_by(Gene, sample) %>%
# summarise(Count=n(), paste(motif_loc, collapse=', ')) %>%
# ungroup() %>%
# # collapse to GFP/RFP/iPSC
# mutate(Type = case_when(grepl('GFP', sample) ~ 'GFP',
# grepl('RFP', sample) ~ 'RFP',
# TRUE ~ 'iPSC')) %>%
# group_by(Gene, Type) %>%
# summarise(Total=sum(Count)) %>%
# arrange(-Total) %>%
# spread(Gene, Total) %>% t()
#
# colnames(enriched_genes) <- enriched_genes[1,]
# enriched_genes <- enriched_genes[-1,] %>% data.frame() %>% rownames_to_column('Gene') %>% mutate(GFP = as.numeric(GFP), iPSC = as.numeric(iPSC), RFP = as.numeric((RFP)))
#
# enriched_genes[is.na(enriched_genes)] <- 0
#
# enriched_genes %>% mutate(`deltaGFP <-> RFP` = GFP - RFP) %>% arrange(-`deltaGFP <-> RFP`, GFP) %>% head(1000) %>% DT::datatable(rownames = F)
## does PAX6 regulate PAX6?
#Yes, yes it does.
#GFP specific!
# both %>%
# # only keep one gene per motif
# group_by(motif_loc, sample, Gene) %>%
# top_n(1, distance) %>%
# ungroup() %>%
# # keep up to two genes per motif
# group_by(motif_loc, sample) %>%
# top_n(2, distance) %>%
# ungroup() %>%
# # arrange by genes most linked to motif
# group_by(Gene, sample) %>%
# summarise(Count=n(), `Motif Locations` = paste(motif_loc, collapse=', ')) %>%
# arrange(-Count) %>%
# filter(Gene=='PAX6')
```
---
title: Motif / TFBS Analysis
author: David McGaughey
date: '`r format(Sys.Date(), "%Y-%m-%d")`'
output: 
  html_notebook:
    theme: flatly
    toc: true
    code_folding: hide
---

# Workflow to ID motifs and match to genes

(Full implementation in Snakefile)

1. Download TF motifs from cisbp
2. Only keep TF which are abs(logFc) > 1 between GFP/RPE and iPSC (~1200)
3. Check for motifs in narrow ATAC-seq peaks with fimo
4. Identify closest 2 genes (under 500k bp) to each motif
5. Bootstrap steps 2 and 3 250 times each to get background rate

```{r, message=F, warning=F, results='hide'}
# Load Libraries without printing any warnings or messages
library(tidyverse)
library(ggridges)
library(cowplot)
library(DESeq2)
library(ggrepel)
```

# Motif/TF assessed
```{r, results = 'hide'}
# Load in real data
samples_full = list.files('/Volumes/data/projects/nei/hufnagel/iPSC_RPE_ATAC_Seq/fimo_motifs/', pattern = '*dat.gz', full.names = T)

fimo_data <- list()
for (i in samples_full){
  sample <- str_split(str_split(i, '/')[[1]][10], '\\.')[[1]][1]
  input <- read_tsv(i)
  input <- input %>% mutate(sequence_name = as.character(sequence_name)) %>% mutate(sample = sample)
  fimo_data[[sample]] <- input
}
sample_motifs = bind_rows(fimo_data) %>% mutate(sample=as.factor(sample))

#sample_motifs %>% group_by(sample, motif_alt_id) %>% summarise(Count=n()) %>% arrange(-Count)

tf_motif <- sample_motifs %>% select('TF' = `# motif_id`, motif_alt_id) %>% unique()
tf_motif %>% DT::datatable()
```


```{r, warning=F, message=F, echo=F, results='hide'}
# Load in bootstraps
bootstrap_stats = list.files('/Volumes/data/projects/nei/hufnagel/iPSC_RPE_ATAC_Seq/fimo_motifs/bootstrapping_stats/', pattern = '*dat.gz', full.names = T)
#samples_full = list.files('~/Desktop/fimo_motifs/bootstrapping/', pattern = '*dat.gz', full.names = T)

# big difference here vs above
# not saving full file, as each bootstrap is ~200mb COMPRESSED
# just saving the stats
# right now, counts by motif/sample
fimo_data <- list()
for (i in bootstrap_stats){
  sample <- str_split(str_split(i, '/')[[1]][11], '\\.')[[1]][1]
  bootstrap_num <- str_split(str_split(str_split(i, '/')[[1]][11], '\\.')[[1]][2], '_')[[1]][2]
  sample_boot <- paste0(sample,'_',bootstrap_num)
  stats <- read_tsv(i) %>% mutate(sample = as.factor(sample), bootstrap = bootstrap_num)
  fimo_data[[sample_boot]] <- stats
}
1
bootstrap_counts = bind_rows(fimo_data) %>% mutate(sample=as.factor(sample)) %>% left_join(tf_motif)
```

```{r}
# merge sample counts with bootstrap counts
all <- bind_rows(sample_motifs %>% group_by(sample, motif_alt_id) %>% summarise(Count=n()) %>% ungroup() %>% mutate(bootstrap = 'real'),
                 bootstrap_counts)

# motif ggridges plotter

plotter <- function(motif, scale=TRUE) {
  if (scale == TRUE){
    all <- all %>% filter(motif_alt_id == !!motif) %>% 
      group_by(sample, motif_alt_id) %>% 
      mutate(`Z score` = scale(Count)) %>% 
      ungroup() 
    all %>% 
      mutate(sample = factor(sample, levels=c('iPSC_IIi_9','iPSC_IIJ_10','iPSC_IIK_11','iPSC_IIL_12','RFP_IIE_1','RFP_IIF_2','RFP_IIG_3','RFP_IIH_4','GFP_IIE_5','GFP_IIF_6','GFP_IIG_7','GFP_IIH_8'))) %>% 
      ggplot(aes(x=`Z score`, y=sample)) + 
      geom_density_ridges() +
      geom_point(data = all %>% 
                   filter(bootstrap == 'real', motif_alt_id == !!motif) %>% 
                   ungroup(), aes(x=`Z score`,y=sample), colour='blue', size=2, alpha=0.5) +
      scale_y_discrete(expand = c(0.01, 0)) +
      theme_ridges() + 
      ggtitle(paste0(motif, ' (', 
                     tf_motif %>% filter(motif_alt_id == !!motif) %>% pull(TF),
                     ')'))
  }
  else {
    all %>% filter(motif_alt_id == !!motif) %>% 
      mutate(sample = factor(sample, levels=c('iPSC_IIi_9','iPSC_IIJ_10','iPSC_IIK_11','iPSC_IIL_12','RFP_IIE_1','RFP_IIF_2','RFP_IIG_3','RFP_IIH_4','GFP_IIE_5','GFP_IIF_6','GFP_IIG_7','GFP_IIH_8'))) %>% 
      ggplot(aes(x=Count, y=sample)) + 
      geom_density_ridges() +
      geom_point(data = sample_motifs %>% 
                   filter(motif_alt_id == !!motif) %>% 
                   group_by(sample, motif_alt_id) %>%
                   summarise(Count=n()) %>% ungroup(), aes(x=Count,y=sample), colour='blue', size=2, alpha=0.5) +
      scale_y_discrete(expand = c(0.01, 0)) +
      theme_ridges() + 
      ggtitle(paste0(motif, ' (', 
                     tf_motif %>% filter(motif_alt_id == !!motif) %>% pull(TF),
                     ')'))
  }
}
```
# Plot enrichment  specific motif/TFBS

## TET1
```{r}
plotter('M0610_1.02')
```

## DNMT1
```{r}
plotter('M0609_1.02')
```

## PAX6
```{r}
plotter('M6410_1.02')
```

## SOX10
```{r}
plotter('M6470_1.02')
```

## MITF
```{r}
plotter('M6345_1.02')
```

## HES1
```{r}
plotter('M6271_1.02')
```

## SIX3
```{r}
plotter('M1016_1.02')
```

## SIX2
```{r}
plotter('M0952_1.02')
```

# Distribution of TFBS/motif enrichment
## Fold Change
```{r}
delta_motif_FC <- all %>% mutate(Type = case_when(grepl('GFP', sample) ~ 'GFP',
                                                    grepl('RFP', sample) ~ 'RFP',
                                                    TRUE ~ 'iPSC'),
                                   Boot = case_when(bootstrap == 'real' ~ 'real',
                                                    TRUE ~ 'boot')) %>% 
  group_by(Type, motif_alt_id, Boot) %>% 
  mutate(`Z score` = scale(Count)) %>% 
  summarise(Mean=mean(Count)) %>% 
  spread(Boot, Mean) %>%
  ungroup() %>% 
  mutate(FC = real / boot, Type = factor(Type, levels=c('iPSC','RFP','GFP'))) %>% 
  arrange(-FC) %>% 
  left_join(tf_motif) %>% 
  select(-boot, -real) 

delta_motif_FC %>% ggplot(aes(y=Type, x=log2(FC))) + geom_density_ridges() + theme_ridges() 

delta_motif_FC %>% ggplot(aes(y=Type, x=FC)) + geom_density_ridges() + theme_ridges() 
```



```{r}
# calculate z scores for all
Z_scoring <- all %>% 
  # collapse by Type, motif, and whether bootstrap or real
  group_by(sample, motif_alt_id) %>% 
  mutate(`Z score` = scale(Count)) %>% 
  filter(bootstrap == 'real') %>% 
  select(-bootstrap, -TF) %>% 
  left_join(tf_motif)

Z_scoring %>% head()
```
# Distribution of *relative* TFBS/motif enrichment *between* GFP <-> RFP
```{r}
FC_motif_FC <- all %>% mutate(Type = case_when(grepl('GFP', sample) ~ 'GFP',
                                                    grepl('RFP', sample) ~ 'RFP',
                                                    TRUE ~ 'iPSC'),
                                   Boot = case_when(bootstrap == 'real' ~ 'real',
                                                    TRUE ~ 'boot')) %>% 
  # collapse by Type, motif, and whether bootstrap or real
  group_by(Type, motif_alt_id, Boot) %>% 
  summarise(Mean=mean(Count)) %>% 
  # go wide
  spread(Boot, Mean) %>%
  # so you can calculate FC
  mutate(FC = real / boot) %>% 
  arrange(-FC) %>% 
  # add motif name
  left_join(tf_motif) %>% 
  select(-boot, -real) %>%
  # spread again to allow for GFP vs RFP or iPSC comparisons
  spread(Type, FC) %>% 
  filter(GFP > 1.2) %>% 
  mutate(`FC^2 GFP <-> iPSC` = GFP / iPSC, `FC^2 GFP <-> RFP` = GFP / RFP) %>% 
  arrange(-`FC^2 GFP <-> RFP`) 

plot1 <- FC_motif_FC %>% 
  ggplot(aes(x=`FC^2 GFP <-> RFP`)) + 
  geom_density(fill='gray') + 
  theme_minimal() + 
  geom_vline(xintercept = 1.2, color='blue')
plot1
```

# Distribution of *relative* TFBS/motif enrichment *between* GFP <-> iPSC
```{r}
plot2 <- FC_motif_FC %>% 
  ggplot(aes(x=`FC^2 GFP <-> iPSC`)) + 
  geom_density(fill='gray') + 
  theme_minimal()+ 
  geom_vline(xintercept = 1.2, color = 'blue')
plot2 
```

## Both together
```{r}
plot_grid(plot1 + coord_cartesian(xlim=c(0.5,5.5)), plot2 + coord_cartesian(xlim=c(0.5,5.5)), nrow = 2, align = 'hv', axis='tblr')
```

# DESeq2-based analysis
I realized that motifs by samples is similar to genes by samples

We are dropping RFP II E because from the plots, seems rarely enriched in anything
```{r}
cts <- all %>% filter(bootstrap=='real') %>% spread(sample, Count) %>% select(-bootstrap, -TF) %>% data.frame()
row.names(cts) <- cts$motif_alt_id
cts %>% head()
cts <- cts[,-1]

coldata <- all %>% select(sample) %>% unique() %>% filter(sample!='RFP_IIE_1') %>% mutate(Type =  case_when(grepl('GFP', sample) ~ 'GFP',
                                                                            grepl('RFP', sample) ~ 'RFP',
                                                                            TRUE ~ 'iPSC'))

dds <- DESeqDataSetFromMatrix(countData = cts %>% select(one_of(as.character(coldata$sample)) ),
                              colData = coldata,
                              design = ~ Type)
dds$Type <- relevel(dds$Type, ref='RFP')
dds <- DESeq(dds)

res <- results(DESeq(dds))
resLFC <- lfcShrink(dds, coef='Type_GFP_vs_RFP', type='apeglm')
plotMA(resLFC)
plot(hist(resLFC$pvalue))

```

```{r}
# Load in RNA-seq data
gfp_rfp <- read_csv('~/git/ipsc_rpe_RNA-seq/data/GFP_vs_RFP.results.csv')
rpe_ipsc <- read_csv('~/git/ipsc_rpe_RNA-seq/data/RPE_vs_iPSC.results.csv')
gfp_rfp %>% head()
```

## Table of results
The `Z score` is the number of standard deviation of motifs found in the sample peaks above the random set of peaks
```{r}
results <- resLFC %>% 
  data.frame() %>% 
  rownames_to_column('motif_alt_id') %>% 
  left_join(tf_motif) %>% 
  arrange(pvalue) %>% 
  data.frame() %>% 
  left_join(Z_scoring %>%  mutate(Type = case_when(grepl('GFP', sample) ~ 'GFP',
                          grepl('RFP', sample) ~ 'RFP',
                          TRUE ~ 'iPSC')) %>% 
              filter(Type=='GFP') %>% group_by(motif_alt_id) %>% summarise(`Z score` = mean(`Z score`))) %>% 
  left_join(gfp_rfp %>% mutate(TF=Gene, log2FC_GFP_RFP_RNASeq = log2FoldChange) %>% select(TF, log2FC_GFP_RFP_RNASeq)) 
results %>% filter((`Z score`) > 2.5) %>% arrange(pvalue) %>% DT::datatable()
```

## Volcano
```{r}
volcano_maker <- function(df, title){
  df$Class <- 'Not significant'
  df$Class[df$padj < 0.01 & df$log2FoldChange > 0.4 & (df$`Z score`) > 2.5] <- "FDR < 0.01 &\nabs(Z score) > 2.5"
  df$Class[df$padj < 0.01 & df$log2FoldChange < -0.4 & (df$`Z score`) > 2.5] <- "FDR < 0.01 &\nabs(Z score) > 2.5"
  df$Class <- factor(df$Class, levels=c('Not significant', "FDR < 0.01 &\nabs(Z score) > 2.5"))
  plot <- ggplot(data=df,aes(x=log2FoldChange,y=-log10(pvalue))) + 
    geom_point(aes(colour=Class, size=`Z score`), alpha=0.5) +
    scale_colour_manual(values=c("gray","darkred")) + 
    geom_text_repel(data=df %>% filter((padj < 0.01 & log2FoldChange > 0.4 & `Z score` > 2.5) | (padj < 0.01 & log2FoldChange < -0.4 & `Z score` > 2.5)), 
                    aes(label=TF)) +
    # geom_vline(aes(xintercept=-0.5),linetype="dotted") +
    # geom_vline(aes(xintercept=0.5),linetype="dotted") +
    scale_x_continuous(breaks=c(-1,-0.5,0,0.5,1)) +
    ggtitle(title) + theme_minimal()
  return(plot)
}
volcano_maker(results, 'GFP vs RFP TFBS motif counts')

```


## Plots of the top motifs different between GFP and RFP
RFP II E is plotted here, but not used in the differential analysis above
```{r, fig.height=5, fig.width=6}
plots <- list()
for (i in results %>% filter((`Z score`) > 2) %>% arrange(pvalue) %>% head(n=12) %>% pull(motif_alt_id)){
  plots[[i]] <- plotter(i, scale=T)
}


plot_grid(plotlist = plots, ncol=3)
```


```{r}
# Gene <-> motif matching
# load gtf to map transcript to gene_name

gtf <- read_tsv("/Volumes/data/genomes/1000G_phase2_GRCh37/gencode.v28lift37.metadata.HGNC.gz", col_names = c('Transcript','Gene'))
# gtf <- gtf %>% rowwise() %>% mutate(transcript = ((str_split(V9, ';')[[1]] %>% str_extract('transcript_id.*') %>% na.omit())[1] %>% str_split(., '\"'))[[1]][2],
#                              gene = ((str_split(V9, ';')[[1]] %>% str_extract('gene_name.*') %>% na.omit())[1] %>% str_split(., '\"'))[[1]][2])
```

```{r}
## Sample data
### Closest TSS for OTX2 (M5700_1.02)

# samples_full = list.files('/Volumes/data/projects/nei/hufnagel/iPSC_RPE_ATAC_Seq/closest_TSS_motifs/', 
#                           pattern = '*dat.gz', full.names = T)
# 
# closestTSS_data <- list()
# for (i in samples_full){
#   sample <- str_split(str_split(i, '/')[[1]][10], '\\.')[[1]][1]
#  # input <- read_tsv(i, col_names = c('sequence_name', 'start', 'end', 'motif', 'fimo_pvalue', 'strand', 'tss_seq','tss_start','tss_end', 'transcript',  'blank','tss_strand','coord1','coord2','blank2','exon_num','size','exon_pos','distance'))
#   input <- fread(paste('gzcat', i, '| grep M5700_1'))
#   colnames(input) <- c('sequence_name', 'start', 'end', 'motif', 'fimo_pvalue', 'strand', 'tss_seq','tss_start','tss_end', 'transcript',  'blank','tss_strand','coord1','coord2','blank2','exon_num','size','exon_pos','distance')
#   input <- input %>% mutate(sequence_name = as.character(sequence_name),
#                             tss_seq = as.character(tss_seq),
#                             coord1=as.numeric(coord1),
#                             coord2=as.numeric(coord2),
#                             blank2 = as.character(blank2),
#                             exon_num=as.integer(exon_num),
#                             size=as.numeric(size)) %>% mutate(sample = sample)
#   closestTSS_data[[sample]] <- input
# }
# sample_closestTSS = bind_rows(closestTSS_data) %>% mutate(sample=as.factor(sample)) %>% rowwise() %>% mutate(Transcript = str_split(transcript, '_')[[1]][1])
# 
# both <- left_join(sample_closestTSS, gtf) %>% 
#   filter(!is.na(Gene)) %>% 
#   mutate(motif_loc = paste(sequence_name, start, end, sep='_'))
# 
# both %>% 
#   # remove any TSS over 500kb away
#   filter(distance < 500000) %>% 
#   #  one gene per motif
#   group_by(motif_loc, sample, Gene) %>% 
#   top_n(1, distance) %>% 
#   ungroup() %>% 
#   # add up to two genes (total) per motif
#   group_by(motif_loc, sample) %>% 
#   top_n(2, distance) %>% 
#   ungroup() %>% 
#   # arrange by genes most linked to motif  
#   group_by(Gene, sample) %>% 
#   summarise(Count=n(), paste(motif_loc, collapse=', ')) %>% 
#   arrange(-Count)
```

```{r}

# What genes have more PAX6 'associated' motifs compared from GFP to RFP

# enriched_genes <- both %>% 
#   # only keep one gene per motif
#   group_by(motif_loc, sample, Gene) %>% 
#   top_n(1, distance) %>% 
#   ungroup() %>% 
#   # keep up to two genes per motif
#   group_by(motif_loc, sample) %>% 
#   top_n(2, distance) %>% 
#   ungroup() %>% 
#   # arrange by genes most linked to motif  
#   group_by(Gene, sample) %>% 
#   summarise(Count=n(), paste(motif_loc, collapse=', ')) %>% 
#   ungroup() %>% 
#   # collapse to GFP/RFP/iPSC
#   mutate(Type = case_when(grepl('GFP', sample) ~ 'GFP',
#                           grepl('RFP', sample) ~ 'RFP',
#                           TRUE ~ 'iPSC')) %>% 
#   group_by(Gene, Type) %>% 
#   summarise(Total=sum(Count)) %>% 
#   arrange(-Total) %>% 
#   spread(Gene, Total) %>% t() 
# 
# colnames(enriched_genes) <- enriched_genes[1,]
# enriched_genes <- enriched_genes[-1,] %>% data.frame() %>% rownames_to_column('Gene') %>%  mutate(GFP = as.numeric(GFP), iPSC = as.numeric(iPSC), RFP = as.numeric((RFP)))
# 
# enriched_genes[is.na(enriched_genes)] <- 0
# 
# enriched_genes %>% mutate(`deltaGFP <-> RFP` = GFP - RFP) %>% arrange(-`deltaGFP <-> RFP`, GFP) %>% head(1000) %>% DT::datatable(rownames = F)

```


```{r}
## does PAX6 regulate PAX6?
#Yes, yes it does.

#GFP specific!
  
# both %>% 
#   # only keep one gene per motif
#   group_by(motif_loc, sample, Gene) %>% 
#   top_n(1, distance) %>% 
#   ungroup() %>% 
#   # keep up to two genes per motif
#   group_by(motif_loc, sample) %>% 
#   top_n(2, distance) %>% 
#   ungroup() %>% 
#   # arrange by genes most linked to motif  
#   group_by(Gene, sample) %>% 
#   summarise(Count=n(), `Motif Locations` = paste(motif_loc, collapse=', ')) %>% 
#   arrange(-Count) %>% 
#   filter(Gene=='PAX6')
```

```
